The actual concrete compressive strength (MPa) for a given mixture under a specific age (days) was determined from laboratory. Data is in raw form (not scaled). The data has 8 quantitative input variables, and 1 quantitative output variable, and 1030 instances (observations).
Cement manufacturing
Concrete is the most important material in civil engineering. The concrete compressive strength is a highly nonlinear function of age and ingredients. These ingredients include cement, blast furnace slag, fly ash, water, superplasticizer, coarse aggregate, and fine aggregate.
Modeling of strength of high performance concrete using Machine Learning
# for data manipulation
import pandas as pd
import numpy as np
# for plotting
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib import pyplot
import seaborn as sns
sns.set(color_codes=True)
# data transformation and feature generation
from scipy.stats import zscore,norm
from scipy.stats import pearsonr
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
# models
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor  # added: used below but was never imported
from sklearn.neighbors import KNeighborsRegressor  # added: used below but was never imported
from sklearn.ensemble import (RandomForestRegressor, GradientBoostingRegressor,AdaBoostRegressor,BaggingRegressor)
from sklearn.cluster import KMeans
# model selection / validation
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.utils import resample
import warnings
warnings.filterwarnings('ignore')
# Load the concrete compressive-strength dataset (1030 rows x 9 columns,
# all quantitative; target column is 'strength' in MPa).
data=pd.read_csv("concrete.csv")
# Peek at the first 10 rows.
data.head(10)
data.dtypes #data type of each attribute
data.shape #(rows, columns)
There are 1030 rows and 9 columns in the given dataset
# Count missing values per column (the dataset is expected to have none).
data.isnull().sum()
There are no missing values in the given dataset
# Five-number summary (plus mean/std) for every attribute.
data.describe().transpose()
summary=data.describe().transpose()
summary[['min','25%','50%','75%','max']]
# Skewness per column: > 0 means a longer right tail, < 0 a longer left tail.
data.skew(numeric_only = True)
Positive skewness indicates the distribution has a longer right tail (skewed to the right); negative skewness indicates a longer left tail (skewed to the left).
# A quick check to find columns that contain outliers:
# vertical boxplots of every raw column on a shared scale.
fig=plt.figure(figsize=(15,7))
ax=sns.boxplot(data=data,orient='v')
From the boxplots above we can see that all attributes except cement, ash and coarseagg contain outliers.
def _plot_distribution(series, label):
    """Draw one figure with a boxplot, a density plot and a histogram
    for a single numeric column.

    Replaces eight near-identical copy-pasted figure sections.
    sns.distplot was removed in seaborn 0.14; histplot(kde=True,
    stat='density') is the documented replacement.
    """
    plt.figure(figsize=(20, 20))
    # boxplot
    plt.subplot(3, 3, 1)
    sns.boxplot(series, showfliers=True, color='c').set_title(f"Distribution of {label}")
    # density plot with mean/median reference lines
    ax = plt.subplot(3, 3, 2)
    sns.histplot(series, kde=True, stat='density', color='m').set_title(f"{label} Vs Frequency")
    ax.axvline(series.mean(), color='r', linestyle='--', label='Mean', linewidth=1.2)
    ax.axvline(series.median(), color='g', linestyle='--', label='Median', linewidth=1.2)
    ax.legend(loc='best')
    # histogram (raw counts)
    plt.subplot(3, 3, 3)
    series.plot.hist(color='g').set_title(f"{label} Vs Frequency")

# One figure per predictor column, in the original order.
for _col in ['cement', 'slag', 'ash', 'water', 'superplastic',
             'coarseagg', 'fineagg', 'age']:
    _plot_distribution(data[_col], _col)
# Work on a copy so the raw frame keeps its numeric age column.
agedata=data.copy()
def age_bin(data):
    """Map a row's curing age (in days) to a month-bucket label.

    Ages fall into 30-day buckets: 1-30 days -> '1 month',
    31-60 -> '2 months', ... 301-330 -> '11 months'; anything above
    330 days is '12 months'.  Non-positive ages clamp to '1 month',
    matching the original chain of ifs.

    NOTE: the parameter is a single DataFrame row; the name shadows the
    module-level ``data`` frame but is kept for backward compatibility.
    """
    # ceil(age / 30) picks the 30-day bucket; clamp to 1..12.
    month = int(min(12, max(1, -(-data.age // 30))))
    return '1 month' if month == 1 else f'{month} months'
# Bucket every row into a month label and visualise the distribution.
# The lambda wrapper around age_bin was redundant; apply can take it directly.
agedata['age_in_months'] = agedata.apply(age_bin, axis=1)
ax = plt.figure(figsize=(10, 6))
# Include all 12 buckets in the order -- the original list omitted the
# 5/7/8/10/11-month buckets, which would silently drop such rows from the
# plot.  seaborn >= 0.13 also requires x= as a keyword argument.
month_order = ['1 month'] + [f'{m} months' for m in range(2, 13)]
sns.countplot(x=agedata['age_in_months'], order=month_order)
print(agedata['age_in_months'].value_counts())
# Distribution of the target variable: boxplot, density plot and histogram.
plt.figure(figsize=(20,20))
# boxplot
plt.subplot(3,3,1)
sns.boxplot(data.strength,showfliers=True,color='c').set_title("Distribution of strength")
# density plot (sns.distplot was removed in seaborn 0.14 -> histplot)
ax=plt.subplot(3,3,2)
sns.histplot(data.strength,kde=True,stat='density',color='m').set_title("strength Vs Frequency")
ax.axvline(data.strength.mean(),color='r',linestyle='--',label='Mean',linewidth=1.2)
ax.axvline(data.strength.median(),color='g',linestyle='--',label='Median',linewidth=1.2)
ax.legend(loc='best')
# histogram (raw counts)
plt.subplot(3,3,3)
data.strength.plot.hist(color='g').set_title("strength Vs Frequency");
# Regression fit of each predictor against strength, one figure per column.
for col in list(data.columns)[:-1]:
    fig,ax1=plt.subplots(figsize=(15,7.5),ncols=1,sharex=False)
    sns.regplot(x=data[col],y=data['strength'],ax=ax1).set_title(f'{col} Vs strength')
# Pairwise scatter matrix with KDE diagonals.
sns.pairplot(data,diag_kind='kde');
# Correlation heatmap of all attributes (annotated, capped at 0.8).
plt.figure(figsize=(25,25))
ax=sns.heatmap(data.corr(),vmax=.8,square=True,fmt='.2f',annot=True,linecolor='white',linewidths=0.01)
plt.title('Correlation of Attributes')
plt.show()
Cement does not have any significant relationship with the other independent attributes, but it is positively associated with the target attribute strength; the relation is not very strong.
Slag does not have any significant relation with any of the attributes.
Ash does not have any significant relation with any of the attributes.
Water has a negative association with superplastic and fineagg; there is no other significant relation with any other attribute.
Superplastic has a negative association with water and a positive association with ash and strength, but this relation is not very strong.
Coarseagg does not have any significant relation with any of the attributes.
Age has a very slight positive association with strength.
# Work on a copy of the original data so raw values stay available.
data1=data.copy()
data1.boxplot(figsize=(40,20));
# Draw the boxplot again, keeping the artist dictionary (return_type='both')
# so whisker/cap/flier y-values can be read back for the outlier treatment.
_, bp = data1.boxplot(return_type='both', figsize=(20,10), rot='vertical')
fliers = [flier.get_ydata() for flier in bp["fliers"]]   # points beyond the whiskers, per column
boxes = [box.get_ydata() for box in bp["boxes"]]
caps = [cap.get_ydata() for cap in bp['caps']]           # two caps (lower, upper) per column
whiskers = [whiskers.get_ydata() for whiskers in bp["whiskers"]]
# Outlier count per column = number of flier points drawn by the boxplot.
for idx, col in enumerate(data1.columns):
    print('Number of outliers in ',col, '-', len(fliers[idx]))
# Outlier treatment: values outside the 1.5*IQR fences are clamped to the
# corresponding boxplot cap (whisker end) value -- NOT the median, despite
# what the closing summary text says.
for idx, col in enumerate(data1.columns):
    q1 = data1[col].quantile(0.25)
    q3 = data1[col].quantile(0.75)
    low = q1 - 1.5*(q3 - q1)
    high = q3 + 1.5*(q3 - q1)
    # caps come in pairs per column: index 2*idx is the lower cap, 2*idx+1 the upper
    data1.loc[(data1[col] < low), col] = caps[idx * 2][0]
    data1.loc[(data1[col] > high), col] = caps[idx * 2 + 1][0]
# Check the dataset after outlier treatment.
fig = plt.figure(figsize = (15, 7.2))
ax = sns.boxplot(data = data1.iloc[:, 0:18], orient = 'h')
# Standardise every column to zero mean / unit variance.
from scipy.stats import zscore
data_scaled = data1.apply(zscore)
data_scaled
# Covariance matrix of the standardised features (features as rows after .T).
cov_matrix = np.cov(data_scaled.T)
print('Covariance Matrix \n%s',cov_matrix)
# Eigen-decomposition: eigenvalues give the variance captured along each
# eigenvector direction.
e_vals, e_vecs = np.linalg.eig(cov_matrix)
print('Eigen Values \n%s' %e_vals)
print('\nEigen Vectors \n%s'%e_vecs)
# Percentage of total variance explained by each component, largest first,
# plus the running (cumulative) total.
tot = e_vals.sum()
var_exp = [100.0 * val / tot for val in sorted(e_vals, reverse=True)]
cum_var_exp = np.cumsum(var_exp)
print("Cumulative Variance Explained",cum_var_exp)
# Plot the variance explained by each principal component and the cumulative
# variance explained; the dotted line marks the 95% threshold.
plt.figure(figsize=(15,10))
plt.axhline(y=95, color='r', linestyle=':')
plt.bar(range(1, e_vals.size + 1), var_exp, alpha = 0.5, align = 'center', label = 'Individual explained variance')
plt.step(range(1, e_vals.size + 1), cum_var_exp, where='mid', label = 'Cumulative explained variance')
plt.ylabel('Explained Variance Ratio')
plt.xlabel('Principal Components')
plt.legend(loc = 'best')
plt.tight_layout()
plt.show()
# Project the standardised data onto the first 6 principal components.
pca=PCA(n_components=6).fit_transform(data_scaled)
# Array -> DataFrame so the components can be visualised in a pairplot.
pca_df=pd.DataFrame(pca)
sns.pairplot(pca_df,diag_kind='kde')
# Feature/target split.  Note: y is the z-scored strength, not raw MPa.
X = data_scaled.drop(['strength'], axis=1)
y = data_scaled.strength
X.head()
# 70:30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.30, random_state=1)
print("Shape of Training Data :", X_train.shape)
print("Shape of Testing Data :", X_test.shape)
# Ordinary least squares baseline; score() returns R^2.
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
lr_trainscore = lr_model.score(X_train, y_train)
lr_testscore = lr_model.score(X_test, y_test)
# Per-feature coefficients of the fitted model.
for col_name, coef in zip(X_train.columns, lr_model.coef_):
    print("The coefficient for {} is {}".format(col_name, coef))
# Regularised linear baselines: L2 (ridge) and L1 (lasso).
ridge = Ridge(alpha=.3)
ridge.fit(X_train, y_train)
ridge_trainscore = ridge.score(X_train, y_train)
ridge_testscore = ridge.score(X_test, y_test)
print ("Ridge model:", (ridge.coef_))
lasso = Lasso(alpha=0.1)
lasso.fit(X_train, y_train)
lasso_trainscore = lasso.score(X_train, y_train)
lasso_testscore = lasso.score(X_test, y_test)
print ("Lasso model:", (lasso.coef_))
# Side-by-side R^2 comparison of the three linear models.
score = {
    'Train Score': {'Regression': lr_trainscore, 'Ridge': ridge_trainscore, 'Lasso': lasso_trainscore},
    'Test Score': {'Regression': lr_testscore, 'Ridge': ridge_testscore, 'Lasso': lasso_testscore},
}
score_df = pd.DataFrame(score)
score_df
# Running results table, extended after each model family below.
results = pd.DataFrame({'Method': ['Regression'], 'Train Accuracy': lr_trainscore, 'Test Accuracy': lr_testscore}, index={'1'})
for label, tr, te, ix in [('Ridge', ridge_trainscore, ridge_testscore, '2'),
                          ('Lasso', lasso_trainscore, lasso_testscore, '3')]:
    tempresultsdf = pd.DataFrame({'Method': [label], 'Train Accuracy': tr, 'Test Accuracy': te}, index={ix})
    results = pd.concat([results, tempresultsdf])
results = results[['Method', 'Train Accuracy', 'Test Accuracy']]
results
# Polynomial feature expansion.
# NOTE(review): interaction_only=True keeps only cross terms (x1*x2) and
# drops the squared terms, so despite the inline comment this is NOT a full
# quadratic expansion.
poly=PolynomialFeatures(degree = 2, interaction_only=True)#quadratic with degree 2
X_poly=poly.fit_transform(X)
Xpoly_train,Xpoly_test,ypoly_train,ypoly_test=train_test_split(X_poly,y,test_size=.30,random_state=1)
Xpoly_train.shape
# Refit the SAME estimator objects on the expanded features -- this
# overwrites the coefficients learned on the plain features above.
lr_model.fit(Xpoly_train,ypoly_train)
print(lr_model.coef_)
#Train accuracy score (R^2)
lr_model.score(Xpoly_train,ypoly_train)
#Test accuracy score (R^2)
lr_model.score(Xpoly_test,ypoly_test)
ridge.fit(Xpoly_train,ypoly_train)#ridge model
print ("Ridge model:", (ridge.coef_))
lasso.fit(Xpoly_train,ypoly_train)#lasso model
print ("Lasso model:", (lasso.coef_))
print("Ridge Scores: ")
print("train score : ", ridge.score(Xpoly_train,ypoly_train))
print("test score : ", ridge.score(Xpoly_test,ypoly_test))
print()
print("Lasso Scores:")
print("train score : ", lasso.score(Xpoly_train,ypoly_train))
print("test score : ", lasso.score(Xpoly_test,ypoly_test))
Now, we will use K-Means clustering to group the observations based on their attributes. First, we need to determine the optimal number of groups; for that we use the elbow test to see where the knee of the distortion curve occurs.
### Finding Optimal no of clusters
# Elbow method: average distortion (mean distance of each point to its
# nearest centroid) for k = 1..9 clusters.
from scipy.spatial.distance import cdist
clusters = range(1, 10)
meanDistortions = []
for k in clusters:
    model = KMeans(n_clusters=k)
    model.fit(data_scaled)
    prediction = model.predict(data_scaled)
    # distance of every row to its closest cluster centre
    nearest = np.min(cdist(data_scaled, model.cluster_centers_, 'euclidean'), axis=1)
    meanDistortions.append(nearest.sum() / data_scaled.shape[0])
plt.plot(clusters, meanDistortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Average distortion')
plt.title('Selecting k with the Elbow Method')
# Fit K-Means with 3 clusters and attach the cluster label to each row.
kmeans_model=KMeans(3)
kmeans_model.fit(data_scaled)
prediction=kmeans_model.predict(data_scaled)
kmeans_df=data_scaled.copy()#work on a copy of the scaled data
kmeans_df["Group"]=prediction  # cluster label per row
kmeans_df.head()
# Per-cluster mean of every (z-scored) attribute.
# NOTE(review): this rebinding shadows the `clusters` range from the elbow loop.
clusters = kmeans_df.groupby(["Group"])
clusters.mean()
kmeans_df.boxplot(by='Group',layout=(3,3),figsize=(15,10));
# Scatter + regression fit of each ingredient against strength, coloured by
# K-Means group.  Replaces eight copy-pasted sections (one of which had a
# wrong "#slag vs cement" comment).  seaborn >= 0.12 removed positional
# x/y arguments to lmplot, so they are passed as keywords here.
for _feature in ['cement', 'slag', 'ash', 'superplastic',
                 'coarseagg', 'fineagg', 'water', 'age']:
    with sns.axes_style("white"):
        plot = sns.lmplot(x=_feature, y='strength', data=kmeans_df, hue='Group')
        plot.set(ylim=(-3, 3))   # same y-window as the originals
# Fit a plain decision tree to rank feature importances.
# NOTE(review): DecisionTreeRegressor comes from sklearn.tree and must be
# imported at the top of the file.
dt=DecisionTreeRegressor()
dt.fit(X_train,y_train)
# columns[:-1] assumes 'strength' is the last column of data_scaled.
pd.DataFrame(dt.feature_importances_, index = data_scaled.columns[:-1],
columns=['Importance']).sort_values('Importance',ascending=False).plot(kind='bar',color='c', figsize=(15,7), title='Feature Importance of Decision Tree')
# Decision tree on the reduced feature set (drop the least-important
# features found above).
data_dt = data_scaled.copy()
X = data_dt.drop(['strength', 'ash', 'fineagg', 'coarseagg'], axis=1)
y = data_dt['strength']
# Split X & y into training and testing sets in the ratio 70:30.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.30, random_state=1)
dt2 = DecisionTreeRegressor()
dt2.fit(X_train, y_train)
y_pred = dt2.predict(X_test)
dt_trainacc = dt2.score(X_train, y_train)   # R^2 on train
dt_testacc = dt2.score(X_test, y_test)      # R^2 on test
print('Training Accuracy (DT) : ', dt_trainacc)
print('Testing Accuracy (DT): ', dt_testacc)  # fixed "Accuract" typo
sns.set(style="darkgrid", color_codes=True)
with sns.axes_style("white"):
    # stat_func was removed from sns.jointplot in seaborn 0.11
    sns.jointplot(x=y_test, y=y_pred, kind="reg", color="k")
tempresultsdf = pd.DataFrame({'Method': ['Decision Tree'], 'Train Accuracy': dt_trainacc, 'Test Accuracy': dt_testacc}, index={'4'})
results = pd.concat([results, tempresultsdf])
results = results[['Method', 'Train Accuracy', 'Test Accuracy']]
results
# Grid search for the decision tree.
# sklearn >= 1.0 renamed the criteria: 'mse' -> 'squared_error' and
# 'mae' -> 'absolute_error'; the old names raise an error.
param_grid = {'max_depth': np.arange(3, 6),
              'criterion': ['squared_error', 'absolute_error'],
              'max_leaf_nodes': [100, 105, 90, 95],
              'min_samples_split': [6, 7, 8, 9, 10],
              'max_features': [2, 3, 4, 5, 6]}
grid_tree_dt = GridSearchCV(DecisionTreeRegressor(), param_grid, cv = 10, scoring= 'r2')
grid_tree_dt.fit(X_train, y_train)
print(grid_tree_dt.best_estimator_)
print('Best Score:', np.abs(grid_tree_dt.best_score_))
# Refit the decision tree with the best hyper-parameters found above.
# 'mae' was renamed 'absolute_error' in sklearn >= 1.0.
dt3 = DecisionTreeRegressor(criterion='absolute_error', max_depth=5, min_samples_split=6, max_leaf_nodes=105, max_features=4)
dt3.fit(X_train, y_train)
y_preddt = dt3.predict(X_test)
dt3_trainacc = dt3.score(X_train, y_train)
dt3_testacc = dt3.score(X_test, y_test)
print('Training Accuracy (DT Hyperparameter tuning) : ', dt3_trainacc)
print('Testing Accuracy (DT Hyperparameter tuning): ', dt3_testacc)  # fixed typo
sns.set(style="darkgrid", color_codes=True)
with sns.axes_style("white"):
    # stat_func was removed from sns.jointplot in seaborn 0.11
    sns.jointplot(x=y_test, y=y_preddt, kind="reg", color="k")
tempresultsdf = pd.DataFrame({'Method': ['Decision Tree with Hyperparameter Tuning'], 'Train Accuracy': dt3_trainacc, 'Test Accuracy': dt3_testacc}, index={'5'})
results = pd.concat([results, tempresultsdf])
results = results[['Method', 'Train Accuracy', 'Test Accuracy']]
results
# Random forest: feature importance on all features, then refit on the
# reduced feature set.
data_rf = data_scaled.copy()
X = data_rf.drop(['strength'], axis=1)
y = data_rf['strength']
# Split X & y into training and testing sets in the ratio 70:30.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.30, random_state=1)
rf = RandomForestRegressor()
rf.fit(X_train, y_train)
# columns[:-1] assumes 'strength' is the last column of data_rf.
pd.DataFrame(rf.feature_importances_, index=data_rf.columns[:-1],
             columns=['Importance']).sort_values('Importance', ascending=False).plot(kind='bar', color='r', figsize=(15, 7), title='Feature Importance of Random Forest')
# Drop the least-important features and refit.
X = data_rf.drop(['strength', 'ash', 'fineagg', 'coarseagg'], axis=1)
y = data_rf['strength']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.30, random_state=1)
rf2 = RandomForestRegressor()
rf2.fit(X_train, y_train)
y_pred = rf2.predict(X_test)
rf_trainacc = rf2.score(X_train, y_train)
rf_testacc = rf2.score(X_test, y_test)
print('Training Accuracy (RF) : ', rf_trainacc)
print('Testing Accuracy (RF): ', rf_testacc)  # fixed "Accuract" typo
sns.set(style="darkgrid", color_codes=True)
with sns.axes_style("white"):
    # stat_func was removed from sns.jointplot in seaborn 0.11
    sns.jointplot(x=y_test, y=y_pred, kind="reg", color="k")
tempresultsdf = pd.DataFrame({'Method': ['Random Forest'], 'Train Accuracy': rf_trainacc, 'Test Accuracy': rf_testacc}, index={'6'})
results = pd.concat([results, tempresultsdf])
results = results[['Method', 'Train Accuracy', 'Test Accuracy']]
results
# Grid search for the random forest.
# sklearn renames: 'mse'/'mae' -> 'squared_error'/'absolute_error' (>= 1.0);
# max_features='auto' was removed in 1.3 -- 1.0 (all features) is the
# regressor equivalent.
param_grid = {'max_depth': np.arange(3, 8),
              'criterion': ['squared_error', 'absolute_error'],
              'max_leaf_nodes': [100, 105, 90, 95],
              'min_samples_split': [6, 7, 8, 9, 10],
              'max_features': [1.0, 'sqrt', 'log2']}
grid_tree_rf = GridSearchCV(RandomForestRegressor(), param_grid, cv = 5, scoring= 'r2')
grid_tree_rf.fit(X_train, y_train)
print(grid_tree_rf.best_estimator_)
print('Best Score:', np.abs(grid_tree_rf.best_score_))
# Refit the random forest with the best grid-search parameters.
# 'mse' was renamed 'squared_error' in sklearn >= 1.0.
rf3 = RandomForestRegressor(criterion='squared_error', max_leaf_nodes=90, max_depth=7, min_samples_split=6)
rf3.fit(X_train, y_train)
y_predrf = rf3.predict(X_test)
rf3_trainacc = rf3.score(X_train, y_train)
rf3_testacc = rf3.score(X_test, y_test)
print('Training Accuracy (RF Hyperparameter tuning) : ', rf3_trainacc)
print('Testing Accuracy (RF Hyperparameter tuning): ', rf3_testacc)  # fixed typo
sns.set(style="darkgrid", color_codes=True)
with sns.axes_style("white"):
    # stat_func was removed from sns.jointplot in seaborn 0.11
    sns.jointplot(x=y_test, y=y_predrf, kind="reg", color="k")
tempresultsdf = pd.DataFrame({'Method': ['Random Forest with Hyperparameter Tuning'], 'Train Accuracy': rf3_trainacc, 'Test Accuracy': rf3_testacc}, index={'7'})
results = pd.concat([results, tempresultsdf])
results = results[['Method', 'Train Accuracy', 'Test Accuracy']]
results
# Gradient boosting: feature importance on all features, then refit on the
# reduced feature set.
data_gb = data_scaled.copy()
X = data_gb.drop(['strength'], axis=1)
y = data_gb['strength']
# Split X & y into training and testing sets in the ratio 70:30.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.30, random_state=1)
gb = GradientBoostingRegressor()
gb.fit(X_train, y_train)
# columns[:-1] assumes 'strength' is the last column of data_gb.
pd.DataFrame(gb.feature_importances_, index=data_gb.columns[:-1],
             columns=['Importance']).sort_values('Importance', ascending=False).plot(kind='bar', color='g', figsize=(15, 7), title='Feature Importance of Gradient Boost Regressor')
# Drop the least-important features and refit.
X = data_gb.drop(['strength', 'ash', 'fineagg', 'coarseagg'], axis=1)
y = data_gb['strength']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.30, random_state=1)
gb2 = GradientBoostingRegressor()
gb2.fit(X_train, y_train)
y_pred = gb2.predict(X_test)
gb_trainacc = gb2.score(X_train, y_train)
gb_testacc = gb2.score(X_test, y_test)
print('Training Accuracy (GB) : ', gb_trainacc)
print('Testing Accuracy (GB): ', gb_testacc)  # fixed "Accuract" typo
sns.set(style="darkgrid", color_codes=True)
with sns.axes_style("white"):
    # stat_func was removed from sns.jointplot in seaborn 0.11
    sns.jointplot(x=y_test, y=y_pred, kind="reg", color="k")
tempresultsdf = pd.DataFrame({'Method': ['Gradient Boost'], 'Train Accuracy': gb_trainacc, 'Test Accuracy': gb_testacc}, index={'8'})
results = pd.concat([results, tempresultsdf])
results = results[['Method', 'Train Accuracy', 'Test Accuracy']]
results
# Hyper-parameter search space for the gradient-boosting model.
# NOTE(review): 4 * 11 * 46 * 2 = 4048 candidates x 5 folds -- this search
# takes a long time to run.
param_grid = dict(
    n_estimators=[100, 200, 250, 500],
    max_depth=range(10, 31, 2),
    min_samples_split=range(50, 501, 10),
    learning_rate=[0.1, 0.2],
)
clf = GridSearchCV(
    GradientBoostingRegressor(random_state=1),
    param_grid, cv=5, scoring='r2',
).fit(X_train, y_train)
print(clf.best_estimator_)
print('Best Score:', clf.best_score_)
# Refit gradient boosting with the best grid-search parameters.
gb3 = GradientBoostingRegressor(max_depth=18, min_samples_split=140, n_estimators=500,
                                random_state=1)
gb3.fit(X_train, y_train)
y_predgb = gb3.predict(X_test)
gb3_trainacc = gb3.score(X_train, y_train)
gb3_testacc = gb3.score(X_test, y_test)
print('Training Accuracy (GB Hyperparameter tuning) : ', gb3_trainacc)
print('Testing Accuracy (GB Hyperparameter tuning): ', gb3_testacc)  # fixed typo
sns.set(style="darkgrid", color_codes=True)
with sns.axes_style("white"):
    # stat_func was removed from sns.jointplot in seaborn 0.11
    sns.jointplot(x=y_test, y=y_predgb, kind="reg", color="k")
tempresultsdf = pd.DataFrame({'Method': ['Gradient Boost with Hyperparameter Tuning'], 'Train Accuracy': gb3_trainacc, 'Test Accuracy': gb3_testacc}, index={'9'})
results = pd.concat([results, tempresultsdf])
results = results[['Method', 'Train Accuracy', 'Test Accuracy']]
results
# AdaBoost: feature importance on all features, then refit on the reduced
# feature set.
data_ab = data_scaled.copy()
X = data_ab.drop(['strength'], axis=1)
y = data_ab['strength']
# Split X & y into training and testing sets in the ratio 70:30.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.30, random_state=1)
ab = AdaBoostRegressor()
ab.fit(X_train, y_train)
# columns[:-1] assumes 'strength' is the last column of data_ab.
pd.DataFrame(ab.feature_importances_, index=data_ab.columns[:-1],
             columns=['Importance']).sort_values('Importance', ascending=False).plot(kind='bar', color='y', figsize=(15, 7), title='Feature Importance of Ada Boost Regressor')
# Drop the least-important features and refit.
X = data_ab.drop(['strength', 'ash', 'fineagg', 'coarseagg'], axis=1)
y = data_ab['strength']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.30, random_state=1)
ab2 = AdaBoostRegressor()
ab2.fit(X_train, y_train)
y_pred = ab2.predict(X_test)
ab_trainacc = ab2.score(X_train, y_train)
ab_testacc = ab2.score(X_test, y_test)
print('Training Accuracy (AB) : ', ab_trainacc)
print('Testing Accuracy (AB): ', ab_testacc)  # fixed "Accuract" typo
sns.set(style="darkgrid", color_codes=True)
with sns.axes_style("white"):
    # stat_func was removed from sns.jointplot in seaborn 0.11
    sns.jointplot(x=y_test, y=y_pred, kind="reg", color="k")
tempresultsdf = pd.DataFrame({'Method': ['Ada Boost'], 'Train Accuracy': ab_trainacc, 'Test Accuracy': ab_testacc}, index={'10'})
results = pd.concat([results, tempresultsdf])
results = results[['Method', 'Train Accuracy', 'Test Accuracy']]
results
# Hyper-parameter search space for AdaBoost.
param_grid = dict(
    n_estimators=[100, 200, 250, 500],
    loss=['linear', 'square', 'exponential'],
    learning_rate=[0.1, 0.2],
)
clf = GridSearchCV(
    AdaBoostRegressor(random_state=1),
    param_grid, cv=5, scoring='r2',
).fit(X_train, y_train)
print(clf.best_estimator_)
print('Best Score:', clf.best_score_)
# Refit AdaBoost with the best grid-search parameters.
ab3 = AdaBoostRegressor(learning_rate=0.2, loss='square', n_estimators=500,
                        random_state=1)
ab3.fit(X_train, y_train)
y_predab = ab3.predict(X_test)
ab3_trainacc = ab3.score(X_train, y_train)
ab3_testacc = ab3.score(X_test, y_test)
print('Training Accuracy (AB Hyperparameter tuning) : ', ab3_trainacc)
print('Testing Accuracy (AB Hyperparameter tuning): ', ab3_testacc)  # fixed typo
sns.set(style="darkgrid", color_codes=True)
with sns.axes_style("white"):
    # stat_func was removed from sns.jointplot in seaborn 0.11
    sns.jointplot(x=y_test, y=y_predab, kind="reg", color="k")
tempresultsdf = pd.DataFrame({'Method': ['Ada Boost with Hyperparameter Tuning'], 'Train Accuracy': ab3_trainacc, 'Test Accuracy': ab3_testacc}, index={'11'})
results = pd.concat([results, tempresultsdf])
results = results[['Method', 'Train Accuracy', 'Test Accuracy']]
results
# Bagging regressor on the same reduced-feature split.
br = BaggingRegressor()
br.fit(X_train, y_train)
y_pred = br.predict(X_test)
br_trainacc = br.score(X_train, y_train)
br_testacc = br.score(X_test, y_test)
print('Training Accuracy (BR) : ', br_trainacc)
print('Testing Accuracy (BR): ', br_testacc)  # fixed "Accuract" typo
sns.set(style="darkgrid", color_codes=True)
with sns.axes_style("white"):
    # stat_func was removed from sns.jointplot in seaborn 0.11
    sns.jointplot(x=y_test, y=y_pred, kind="reg", color="k")
tempresultsdf = pd.DataFrame({'Method': ['Bagging Regressor'], 'Train Accuracy': br_trainacc, 'Test Accuracy': br_testacc}, index={'12'})
results = pd.concat([results, tempresultsdf])
results = results[['Method', 'Train Accuracy', 'Test Accuracy']]
results
# Hyper-parameter search space for the bagging regressor.
param_grid = dict(
    n_estimators=[100, 200, 250, 500],
    max_features=[2, 4, 6, 8],
)
clf = GridSearchCV(
    BaggingRegressor(random_state=1),
    param_grid, cv=5, scoring='r2',
).fit(X_train, y_train)
print(clf.best_estimator_)
print('Best Score:', clf.best_score_)
# Refit the bagging regressor with the best grid-search parameters.
br2 = BaggingRegressor(max_features=4, n_estimators=100, random_state=1)
br2.fit(X_train, y_train)
y_predbr = br2.predict(X_test)  # renamed from misleading y_predgb
br2_trainacc = br2.score(X_train, y_train)
br2_testacc = br2.score(X_test, y_test)
print('Training Accuracy (BR Hyperparameter tuning) : ', br2_trainacc)
print('Testing Accuracy (BR Hyperparameter tuning): ', br2_testacc)  # fixed typo
sns.set(style="darkgrid", color_codes=True)
with sns.axes_style("white"):
    # stat_func was removed from sns.jointplot in seaborn 0.11
    sns.jointplot(x=y_test, y=y_predbr, kind="reg", color="k")
tempresultsdf = pd.DataFrame({'Method': ['Bagging Regressor with Hyperparameter Tuning'], 'Train Accuracy': br2_trainacc, 'Test Accuracy': br2_testacc}, index={'13'})
results = pd.concat([results, tempresultsdf])
results = results[['Method', 'Train Accuracy', 'Test Accuracy']]
results
# Elbow search for k: plot test-set mean absolute error for k = 1..29.
# BUG FIX: the original plotted np.mean(pred_i != y_test), a classification
# mismatch rate that is ~1.0 for continuous predictions and therefore
# meaningless; mean absolute error is the appropriate regression curve.
error = []
for i in range(1, 30):
    knn = KNeighborsRegressor(n_neighbors=i)
    knn.fit(X_train, y_train)
    pred_i = knn.predict(X_test)
    error.append(np.mean(np.abs(pred_i - y_test)))
plt.figure(figsize=(12, 6))
plt.plot(range(1, 30), error, color='red', linestyle='dashed', marker='o', markerfacecolor='blue', markersize=10)
plt.title('Error Rate K Value')
plt.xlabel('K Value')
plt.ylabel('Mean error')
# Fit with k=3 (chosen from the elbow plot).
knn = KNeighborsRegressor(n_neighbors=3)
knn.fit(X_train, y_train)
y_predknn = knn.predict(X_test)
knn_trainacc = knn.score(X_train, y_train)
knn_testacc = knn.score(X_test, y_test)
print('Training Accuracy (KNN) : ', knn_trainacc)
print('Testing Accuracy (KNN): ', knn_testacc)  # fixed "BKNN" typo
# index '15' avoids the original duplicate '13' label (already used by the
# tuned bagging row).
tempresultsdf = pd.DataFrame({'Method': ['KNN'], 'Train Accuracy': knn_trainacc, 'Test Accuracy': knn_testacc}, index={'15'})
results = pd.concat([results, tempresultsdf])
results = results[['Method', 'Train Accuracy', 'Test Accuracy']]
results
# Grid search over k, weighting scheme and distance metric for KNN.
param_grid = {'n_neighbors': range(1, 21, 2),
              'weights': ['uniform', 'distance'],
              'metric': ['euclidean', 'manhattan', 'minkowski']}
clf = GridSearchCV(KNeighborsRegressor(), param_grid, cv=5, scoring='r2').fit(X_train, y_train)
print(clf.best_estimator_)
print('Best Score:', clf.best_score_)
# Refit with the best metric/weights (n_neighbors left at the default 5,
# as in the original).
knn2 = KNeighborsRegressor(metric='euclidean', weights='distance')
knn2.fit(X_train, y_train)
y_predknn2 = knn2.predict(X_test)  # renamed from misleading y_predgb
knn2_trainacc = knn2.score(X_train, y_train)
knn2_testacc = knn2.score(X_test, y_test)
print('Training Accuracy (KNN Hyperparameter tuning) : ', knn2_trainacc)
print('Testing Accuracy (KNN Hyperparameter tuning): ', knn2_testacc)  # fixed typos
tempresultsdf = pd.DataFrame({'Method': ['KNN with Hyperparameter Tuning'], 'Train Accuracy': knn2_trainacc, 'Test Accuracy': knn2_testacc}, index={'14'})
results = pd.concat([results, tempresultsdf])
results = results[['Method', 'Train Accuracy', 'Test Accuracy']]
results
# Sort models by test R^2 and draw a horizontal bar chart; each bar is
# annotated with its score formatted as a percentage.
results=results.sort_values("Test Accuracy",ascending=False)
fig=plt.figure(figsize=(15,10))
ax=sns.barplot(y="Method", x=("Test Accuracy"),data=results)
total = len(results["Test Accuracy"])
for p in ax.patches:
    percentage = '{:.1f}%'.format(100 * p.get_width())  # bar width == test R^2
    x = p.get_x() + p.get_width() + 0.02   # place label just right of the bar
    y = p.get_y() + p.get_height()/2
    ax.annotate(percentage, (x, y))
plt.show()
data_scaled.head()
values=data_scaled.values  # ndarray: columns 0..7 are features, last column is strength
from sklearn.metrics import r2_score
# Bootstrap the tuned gradient-boosting model to get a distribution of
# out-of-bag R^2 scores.
n_iterations = 1000 # Number of bootstrap samples to create
n_size = int(len(data_scaled) * 0.50) # picking only 50 % of the given data in every bootstrap sample
# run bootstrap
stats = list()
for i in range(n_iterations):
    # prepare train and test sets
    train = resample(values, n_samples=n_size) # Sampling with replacement
    # Out-of-bag rows = rows not drawn into the sample.
    # NOTE(review): this row-wise membership test is O(n^2) per iteration and
    # compares float rows for exact equality -- slow but functional here.
    test = np.array([x for x in values if x.tolist() not in train.tolist()])
    # fit model
    model = GradientBoostingRegressor(random_state=1, max_depth=12,
    min_samples_split=100, n_estimators=200,
    learning_rate=0.2)
    model.fit(train[:,:-1], train[:,-1])
    # evaluate model on the held-out rows
    predictions = model.predict(test[:,:-1])
    score = r2_score(test[:,-1], predictions)  # R^2 (regression metric; the class-imbalance caveat does not apply here)
    print(score)
    stats.append(score)
# plot the distribution of bootstrap scores
pyplot.hist(stats)
pyplot.show()
# confidence intervals
alpha = 0.95 # for 95% confidence
p = ((1.0-alpha)/2.0) * 100  # 2.5th percentile (lower tail)
lower = max(0.0, np.percentile(stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100  # 97.5th percentile (upper tail)
upper = min(1.0, np.percentile(stats, p))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100));
# Repeat the bootstrap, this time with a default-ish random forest.
n_iterations = 1000 # Number of bootstrap samples to create
n_size = int(len(data_scaled) * 0.50) # picking only 50 % of the given data in every bootstrap sample
# run bootstrap
stats = list()
for i in range(n_iterations):
    # prepare train and test sets
    train = resample(values, n_samples=n_size) # Sampling with replacement
    # out-of-bag rows (exact float row comparison; O(n^2) per iteration)
    test = np.array([x for x in values if x.tolist() not in train.tolist()])
    # fit model -- NOTE(review): no random_state here, so results vary between runs
    model = RandomForestRegressor(n_estimators=100)
    model.fit(train[:,:-1], train[:,-1])
    # evaluate model on the held-out rows
    predictions = model.predict(test[:,:-1])
    score = r2_score(test[:,-1], predictions)  # R^2 (regression metric)
    print(score)
    stats.append(score)
# plot the distribution of bootstrap scores
pyplot.hist(stats)
pyplot.show()
# confidence intervals
alpha = 0.95 # for 95% confidence
p = ((1.0-alpha)/2.0) * 100  # 2.5th percentile (lower tail)
lower = max(0.0, np.percentile(stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100  # 97.5th percentile (upper tail)
upper = min(1.0, np.percentile(stats, p))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))
Feature importance was calculated using the decision tree, random forest, gradient boost and AdaBoost regressors; most of them showed age and cement as the most important features.
Outliers were identified using boxplots and were replaced with the nearest whisker (cap) values.